library(ncdf4)
library(ggplot2)
require(data.table)

#**********************
#NOTE: this script prepares intermediate yield and climate data (per crop grid cell) to be used in other R scripts
#you do not need to rerun this script, just use the output Rdata: GridSample-GrSnClimate-Data-final.Rdata and GridSample-SSP2-SSP5-Data-final.Rdata
#some raw yield and climate data below are too large for the Harvard Dataverse Repository, 
#if you need these raw data, please contact the corresponding author. 
#**********************

#==edit your local path to Replication Data for: Nature Food paper NATFOOD-20091526
wd <- "C:\\Users\\yfan\\...\\NFood_replication\\data"


crops = c("Corn", "Wheat", "Soybean", "Cotton", "Rice", "Sugarcane", "Tropical Corn", "Tropical Soybean")
pft = c(17,18,  19,20,   23,24,    41,42,    61,62,   67,68,   75,76, 77,78)#rainfed vs. irrigated
case = c("RCP45","RCP45_85LU","RCP85","RCP85_45CO2","RCP85_SAI","RCP85_MSB","RCP85_CCT")
casename = c("RCP45",expression('RCP45'[(SSP5LU)]),"RCP85",expression('RCP85'[(RCP45CO2)]),"RCP85+SAI","RCP85+MSB","RCP85+CCT")



#===================== Grid cell sampling data for regression===============================

#==============Read in crop yield and climate data 
y.45 <- nc_open(paste0(wd, "/yield_data/RCP45_crop_data.2006-2100.nc"))
y.45_85lu <- nc_open(paste0(wd, "/yield_data/RCP45_85LU_crop_data.2006-2100.nc"))
y.85 <- nc_open(paste0(wd, "/yield_data/RCP85_crop_data.2006-2100.nc"))
y.85_45co2 <- nc_open(paste0(wd, "/yield_data/RCP85_45CO2_crop_data.2006-2100.nc"))
y.sai <- nc_open(paste0(wd, "/yield_data/RCP85_SAI_crop_data.2006-2100.nc"))
y.msb <- nc_open(paste0(wd, "/yield_data/RCP85_MSB_crop_data.2006-2100.nc"))
y.cct <- nc_open(paste0(wd, "/yield_data/RCP85_CCT_crop_data.2006-2100.nc"))

#fertilization rate
##NOTE: unzip the FERTNITRO files under folder /rawdata/FERTNITRO_CFT_SSP2-4.5_SSP5-8.5.zip to /rawdata/*
fertn.ssp2 <- nc_open(paste0(wd, "/yield_data/rawdata/FERTNITRO_CFT_SSP2-4.5_1deg_1850-2100.nc"))
fertn.ssp5 <- nc_open(paste0(wd, "/yield_data/rawdata/FERTNITRO_CFT_SSP5-8.5_1deg_1850-2100.nc"))

#climate variables
wd1<- "C:\\Users\\YFan\\OneDrive - NORCE\\FRAM-NIRD\\FTI\\"
c.85 <- nc_open(paste0(wd1, "/climate_data/RCP85.1deg.clm5.h0.2006-2100.climate.nc")) #surface (2m) climate variables
c.45 <- nc_open(paste0(wd1, "/climate_data/RCP45.1deg.clm5.h0.2006-2100.climate.nc"))
c.sai <- nc_open(paste0(wd1, "/climate_data/RCP85_SAI.1deg.clm5.h0.2020-2100.climate.nc"))
c.msb <- nc_open(paste0(wd1, "/climate_data/RCP85_MSB.1deg.clm5.h0.2020-2100.climate.nc"))
c.cct <- nc_open(paste0(wd1, "/climate_data/RCP85_CCT.1deg.clm5.h0.2020-2100.climate.nc"))
s.85 <- nc_open(paste0(wd1, "/climate_data/SOLAR.RCP85.1deg.clm5.h0.2006-2100.nc"))
s.45 <- nc_open(paste0(wd1, "/climate_data/SOLAR.RCP45.1deg.clm5.h0.2006-2100.nc"))
s.sai <- nc_open(paste0(wd1, "/climate_data/SOLAR.RCP85_SAI.1deg.clm5.h0.2020-2100.nc"))
s.msb <- nc_open(paste0(wd1, "/climate_data/SOLAR.RCP85_MSB.1deg.clm5.h0.2020-2100.nc"))
s.cct <- nc_open(paste0(wd1, "/climate_data/SOLAR.RCP85_CCT.1deg.clm5.h0.2020-2100.nc"))

names(y.85$var)
names(y.85$dim) #dimensions "time" "cft"  "lat"  "lon" 
names(c.85$var)
names(c.85$dim)
names(s.85$var)
v3 <- y.85$var[[3]] #read all info of a variable
v3$name;v3$units;v3$ndims #time is always the last dimension
varsize <- v3$varsize #dim sequence reversed for real data: lon=288 lat=192 cft=16 time=95
rm(v3) 

lon <- ncvar_get(y.85,"lon")
lat <- ncvar_get(y.85,"lat")
cft <- ncvar_get(y.85,"cft")
yrs <- ncvar_get(y.85,"time")



#====Gridded climate data (per unit area, same for all crops within a grid cell)
cft1 = c(17, 19, 23, 41, 61, 67, 75, 77)-15+1 #pft index for active rainfed crops
cft2 = c(18, 20, 24, 42, 62, 68, 76, 78)-15+1 #pft index for active irrigated crops
latmx <- matrix(rep(lat,each=length(lon)),nrow=length(lon),ncol=length(lat)) #matrix of latitude
gridmx <- array(1:(288*192),varsize[1:2]) #matrix of grid index

df.cntl.all<-df.scen.all<-data.table()
df.ssp5.all<-df.ssp2.all<-data.table()
#========climate data: 5 scenarios
scenario <- c("85", "45", "sai", "msb", "cct")
#name = c("RCP45 - RCP85", "SAI - RCP85", "MSB - RCP85", "CCT - RCP85")
for (c in 1:5) {
  tsa <- ncvar_get(get(paste0("c.",scenario[c])), "TSA")-273.15 #to celsius
  rh <- ncvar_get(get(paste0("c.",scenario[c])), "RH2M")
  ppt <- ncvar_get(get(paste0("c.",scenario[c])), "RAIN")+ncvar_get(get(paste0("c.",scenario[c])), "SNOW")
  ppt <- ppt*30*3600*24 #mm/s to mm/month
  radd <- ncvar_get(get(paste0("s.",scenario[c])), "FSDSVD") #only vis band is absorbed as PAR
  radi <- ncvar_get(get(paste0("s.",scenario[c])), "FSDSVI") #do not use FSDSND and FSDSNI

  for (n in 1:8) {
    #df.irrig <- df.rainfed <- coeff.irrig<-coeff.rainfed<-data.table()
    for (i in 0:1){
      if (i==0){
        ncft = seq(1,15,2)[n]  #rainfed
        ncft2 = cft1[n]  #cft index in the surface data
        irrigated=FALSE
      }else{
        ncft = seq(2,16,2)[n]  #irrigated
        ncft2 = cft2[n]  #irrigated
        irrigated=TRUE
      }  
      
      #crop area weight per irrig/rainfed crop, all under SSP5
      rcp85.area <- ncvar_get(y.85, "CROP_AREA", start=c(1,1,ncft,1), count=c(varsize[1:2],1,95)) #read 1 crop, 95 year
      rcp85.area[rcp85.area>=9.9692e+36] <- NA #set NA
      #area.m <- rowMeans(area, dims=2, na.rm=FALSE) #mean area of a cft in each grid during 2006-2099
      #sum(array(1, varsize[1:2])[area.m>1000], na.rm=T) # irrigated cft > 1000 ha
      #summary(c(rcp85.area[rcp85.area>1])) #more than 50% cells with < 1000ha area
      
      #====First step: select grid samples, filtering by crop area and yield thresholds (necessary, otherwise not enough memory!)
      #all grid-year samples with >1000ha of a cft or 0.1% of grid cell in RCP85
      samples <- rcp85.area>=1000
      samples[is.na(samples)] <- FALSE
      #summary(c(area[samples]))
      #samples[yield<=0.05] <- FALSE  #cells with 0 means this cft is missing or dead in some years due to LUC or climate
      #do not filter these cells, they may represent crop failure under extreme climate
      samplesize <- apply(samples, 3, function(x)sum(array(1, varsize[1:2])[x], na.rm = T)) #different number of samples per year
      
    
      #========apply area and month mask and get growing season mean climate

      #==Option 1: use crop specific planting date/harvest cycle, see https://escomp.github.io/ctsm-docs/doc/build/html/tech_note/Crop_Irrigation/CLM50_Tech_Note_Crop_Irrigation.html#table-crop-phenology-parameters
      if (crops[n]=="Rice") {
        GrowNH=c(0,1,1,1,1,1,1,0,0,0,0,0) #Rice plants earlier, growing from Feb-Jun
        GrowSH=c(1,0,0,0,0,0,0,1,1,1,1,1)
      } else if (crops[n]=="Sugarcane"){ #sugarcane grows 10 months, planting and harvest dates similar, so use annual climate
        GrowNH=c(1,1,1,1,1,1,1,1,1,1,1,1)
        GrowSH=c(1,1,1,1,1,1,1,1,1,1,1,1)
      } else {
        GrowNH=c(0,0,0,0,1,1,1,1,1,1,0,0) #most crops grow 150 days from April/June to Aug/Oct
        GrowSH=c(1,1,1,1,0,0,0,0,0,0,1,1) #use average May-Oct for NH, Nov-Apr for SH, as in Levis et al. 2016
      }
      
      if (c==1) {
        #CNTL yield
        rcp85.yield = ncvar_get(y.85, "CROP_YIELD", start=c(1,1,ncft,1), count=c(varsize[1:2],1,95))
        rcp85.yield[rcp85.yield>=9.9692e+36] <- NA  #set NA value
        rcp85_45co2.yield = ncvar_get(y.85_45co2, "CROP_YIELD", start=c(1,1,ncft,1), count=c(varsize[1:2],1,95))
        rcp85_45co2.yield[rcp85_45co2.yield>=9.9692e+36] <- NA #set NA
        rcp85.fertn <- ncvar_get(fertn.ssp5, "FERTNITRO_CFT", start=c(1,1,ncft2,157), count=c(varsize[1:2],1,95)) #read 1 crop, 95 year
        rcp85.fertn[rcp85.fertn>=9.9692e+36] <- NA 
        
        df.ssp5 <- data.table(Year=rep(2006:2100,each=288*192)[samples], Scenario="SSP5", Crop=crops[n], Irrig=irrigated,
                              yield=c(rcp85.yield[samples]), area= c(rcp85.area[samples]),
                              fertn=c(rcp85.fertn[samples]), Sample=rep(gridmx, time=95)[samples])
        df.ssp5.all <- rbind(df.ssp5.all, df.ssp5) #save for land use analysis
        
        cntl.y <- data.table(Year=rep(2006:2100,each=288*192)[samples], 
                             yield0=c(rcp85.yield[samples]), area= c(rcp85.area[samples]), yield1=c(rcp85_45co2.yield[samples]),
                             fertn=c(rcp85.fertn[samples]), Sample=rep(gridmx, time=95)[samples])
        #==each crop has specific growing season climate
        samples.m <- samples[,,rep(1:95, each=12)] #annual data to monthly for calc growing season mean climate (yield and fertn are still annual)
        cntl.c= data.table(ppt=c(ppt[samples.m]), tsa=c(tsa[samples.m]), #2m air temperature
                         rh=c(rh[samples.m]), #q2m=c(q2m[samples.m]), wind=c(wind[samples.m]), sw=c(sw[samples.m]),
                         radd=c(radd[samples.m]), radi=c(radi[samples.m]),
                         Year=rep(rep(2006:2100,each=12),each=288*192)[samples.m], Month=rep(rep(1:12,time=95),each=288*192)[samples.m],
                         GrowingN=rep(rep(GrowNH,time=95),each=288*192)[samples.m], #growing season filter: April-Oct (use LAI>0.5)
                         GrowingS=rep(rep(GrowSH,time=95),each=288*192)[samples.m], #growing season filter: (most crops have only 5 months growing season)
                         Sample=rep(gridmx, time=95*12)[samples.m], Lat=rep(latmx, time=95*12)[samples.m])
        
      } else if (c==2) { #ER scenario
        #====original land use data (yield,area,fertn)
        #RCP45 original land use SSP2
        rcp45.area <- ncvar_get(y.45, "CROP_AREA", start=c(1,1,ncft,1), count=c(varsize[1:2],1,95)) #read 1 crop, 95 year
        rcp45.area[rcp45.area>=9.9692e+36] <- NA #set NA value
        rcp45.yield <- ncvar_get(y.45, "CROP_YIELD", start=c(1,1,ncft,1), count=c(varsize[1:2],1,95)) #read 1 crop, 95 year
        rcp45.yield[rcp45.yield>=9.9692e+36] <- NA #set NA
        rcp45.fertn <- ncvar_get(fertn.ssp2, "FERTNITRO_CFT", start=c(1,1,ncft2,157), count=c(varsize[1:2],1,95)) #years 157:251 = 2006:2100
        rcp45.fertn[rcp45.fertn>=9.9692e+36] <- NA
        
        #note: the mask of SSP2 is different from SSP5; using the same mask SSP5 for the data used for regression and MC resampling, but using original SSP2 data for total land use effect and crop production change
        samples.SSP2 <- rcp45.area>=1000 
        samples.SSP2[is.na(samples.SSP2)] <- FALSE
        #==use this to calc total luc.eff and Y.mod.dif, P.mod.dif
        df.ssp2 <- data.table(Year=rep(2006:2100,each=288*192)[samples.SSP2], Scenario="SSP2", Crop=crops[n], Irrig=irrigated,
                              yield=c(rcp45.yield[samples.SSP2]), area= c(rcp45.area[samples.SSP2]),
                              fertn=c(rcp45.fertn[samples.SSP2]), Sample=rep(gridmx, time=95)[samples.SSP2]) #different mask from SSP5
        df.ssp2.all <- rbind(df.ssp2.all, df.ssp2) 
        
        #==use below for regression analysis and MC resampling based on differencing scen - cntl
        #RCP45_85LU with same land use as RCP85
        yield <- ncvar_get(get(paste0("y.45_85lu")), "CROP_YIELD", start=c(1,1,ncft,1), count=c(-1,-1,1,-1)) #read 1 crop
        area <- ncvar_get(get(paste0("y.45_85lu")), "CROP_AREA", start=c(1,1,ncft,1), count=c(-1,-1,1,-1))
        yield[yield>=9.9692e+36] <- NA #set NA
        area[area>=9.9692e+36] <- NA #set NA
        scen.y <- data.table(Year=rep(2006:2100,each=288*192)[samples],
                             yield=c(yield[samples]), area=c(area[samples]), 
                             yield2=c(rcp45.yield[samples]), area2=c(rcp45.area[samples]), #the mask of SSP5 applied to RCP45 data, used only for regression and resampling because of differencing RCP45-RCP85 (log(yield2)-log(yield))
                             fertn2=c(rcp45.fertn[samples]), Sample=rep(gridmx, time=95)[samples])
        samples.m <- samples[,,rep(1:95, each=12)] #annual to monthly
        scen.c <- data.table(tsa=c(tsa[samples.m]), ppt=c(ppt[samples.m]), rh=c(rh[samples.m]),
                         radd=c(radd[samples.m]), radi=c(radi[samples.m]),
                         Year=rep(rep(2006:2100,each=12),each=288*192)[samples.m], Month=rep(rep(1:12,time=95),each=288*192)[samples.m],
                         GrowingN=rep(rep(GrowNH,time=95),each=288*192)[samples.m], #growing season filter: April-Oct (use LAI>0.5)
                         GrowingS=rep(rep(GrowSH,time=95),each=288*192)[samples.m], #growing season filter: (most crops have only 5 months growing season)
                         Sample=rep(gridmx, time=95*12)[samples.m], Lat=rep(latmx, time=95*12)[samples.m] )
        
      }else{ #RFGs: SAI/MSB/CCT
        yield <- ncvar_get(get(paste0("y.",scenario[c])), "CROP_YIELD", start=c(1,1,ncft,15), count=c(-1,-1,1,81)) #read years 2020:2100
        area  <- ncvar_get(get(paste0("y.",scenario[c])), "CROP_AREA", start=c(1,1,ncft,15), count=c(-1,-1,1,81)) #read years 2020:2100
        yield[yield>=9.9692e+36] <- NA #set NA
        area[area>=9.9692e+36] <- NA #set NA
        
        #note SAI/MSB/CCT start from 2020, the 15th item in time dimension
        scen.y <- data.table(Year=rep(2020:2100,each=288*192)[samples[,,15:95]], yield=c(yield[samples[,,15:95]]), area=c(area[samples[,,15:95]]), 
                             Sample=rep(gridmx, time=81)[samples[,,15:95]])
        samples.m <- samples[,,rep(15:95, each=12)] #note take 15:95 for years 2020:2100
        scen.c <- data.table(tsa=c(tsa[samples.m]), ppt=c(ppt[samples.m]), rh=c(rh[samples.m]),
                         radd=c(radd[samples.m]), radi=c(radi[samples.m]),
                         radd=c(radd[samples.m]), radi=c(radi[samples.m]),
                         Year=rep(rep(2020:2100,each=12),each=288*192)[samples.m], Month=rep(rep(1:12,time=81),each=288*192)[samples.m],
                         GrowingN=rep(rep(GrowNH,time=81),each=288*192)[samples.m], #growing season filter: April-Oct (use LAI>0.5)
                         GrowingS=rep(rep(GrowSH,time=81),each=288*192)[samples.m], #growing season filter: (most crops have only 5 months growing season)
                         Sample=rep(gridmx, time=81*12)[samples.m], Lat=rep(latmx, time=81*12)[samples.m] )
      }
      
      #====growing season climate data====
      if (c==1){ #CNTL
        
        #===Option 1: calculate growing season climate for ech crop according to their planting date and max harvest date
        cntl.NH <- cntl.c[Lat>=0,.(tsa.m=mean(tsa), #tmx.m=mean(tmx), tmn.m=mean(tmn), #tsa.m=mean(tsa), sw.m=mean(sw), wind.m=mean(wind),
                                   rh.m=mean(rh), ppt.m=mean(ppt),radd.m=mean(radd),radi.m=mean(radi)), by=.(Sample,Year,GrowingN)] #growing season mean using NH filter
        cntl.SH <- cntl.c[Lat<0,.(tsa.m=mean(tsa), #tmx.m=mean(tmx), tmn.m=mean(tmn), #tbot.m=mean(tbot), qbot.m=mean(qbot), sw.m=mean(sw), wind.m=mean(wind),
                                  rh.m=mean(rh), ppt.m=mean(ppt),radd.m=mean(radd),radi.m=mean(radi)), by=.(Sample,Year,GrowingS)] #SH filter
        cntl.G1 <- rbind(cntl.NH[GrowingN==1], cntl.SH[GrowingS==1], fill=T)
        cntl.G1 <- cntl.G1[order(Sample,Year),]
        cntl.G1 <- cntl.G1[cntl.y, on=.(Sample,Year)] #merge yield and climate data
        cntl.G1$Scenario=scenario[c]
        cntl.G1$Crop=crops[n]
        cntl.G1$Irrig=irrigated

        ##=====merge output data of all crop types (8x2) 
        df.cntl.all <- rbind(df.cntl.all, cntl.G1) #all raw yield and climate data (growing season accoring to planting date)
        
      }else{ #RFG and ER scenarios
        
        #===Option 1: calculate growing season climate for ech crop according to their planting date and max harvest date
        #NOTE: planting dates of Southern Hemisphere are 6 months later than Northern Hemisphere
        scen.NH <- scen.c[Lat>=0,.(tsa.m=mean(tsa),rh.m=mean(rh),ppt.m=mean(ppt),radd.m=mean(radd),radi.m=mean(radi)), by=.(Sample,Year,GrowingN)] #growing season mean using NH filter
        scen.SH <- scen.c[Lat<0,.(tsa.m=mean(tsa),rh.m=mean(rh),ppt.m=mean(ppt),radd.m=mean(radd),radi.m=mean(radi)), by=.(Sample,Year,GrowingS)] #SH filter
        scen.G1 <- rbind(scen.NH[GrowingN==1], scen.SH[GrowingS==1], fill=T)
        scen.G1 <- scen.G1[order(Sample,Year),]

        scen.G1 <- scen.G1[scen.y, on=.(Sample,Year)] #merge annual yield and climate data
        scen.G1$Scenario=scenario[c]
        scen.G1$Crop=crops[n]
        scen.G1$Irrig=irrigated

        ##=====merge output data of all crop types (8x2) and scenarios (1+4)
        df.scen.all <- rbind(df.scen.all, scen.G1, fill=TRUE)
      }
      
      #release memory for large array
      rm(scen.y,cntl.y,scen.c,cntl.c,scen.G1,cntl.G1,scen.NH,scen.SH,cntl.SH,cntl.NH)
      rm(yield, area, rcp45.area,rcp45.yield, rcp45.fertn, rcp85.area,rcp85.yield,rcp85.fertn, rcp85_45co2.yield)# ,growing,samples.m,samples)
    }#end irrig loop
  }#end crop loop

  rm(tsa,rh,ppt,radd,radi) #q2m,tbot,qbot,wind,sw,lai,tmn,tmx
  rm(samples, samples.SSP2, samples.m)
}#end case loop

df.cntl.all <- df.cntl.all[Year !=2100]
df.scen.all <- df.scen.all[Year !=2100]
df.ssp5.all <- df.ssp5.all[Year !=2100]
df.ssp2.all <- df.ssp2.all[Year !=2100]

nc_close(y.85);nc_close(y.45);nc_close(y.85_45co2);nc_close(y.sai);nc_close(y.msb);nc_close(y.cct);nc_close(fertn.ssp2);nc_close(fertn.ssp5)
nc_close(c.85);nc_close(c.45);nc_close(c.sai);nc_close(c.msb);nc_close(c.cct)
nc_close(s.85);nc_close(s.45);nc_close(s.sai);nc_close(s.msb);nc_close(s.cct)  
rm(s.45,s.85,s.msb,s.cct,s.sai,y.45,y.45_85lu,y.85,y.85_45co2,y.msb,y.cct,y.sai,c.45,c.85,c.msb,c.cct,c.sai,fertn.ssp2,fertn.ssp5)

#check object size
# format(object.size(cntl.c), units="Gb")
# format(object.size(df.cntl.all), units="Gb")

save(list = c("df.cntl.all","df.scen.all"), file =paste0(wd,"/Rdata/GridSample-GrSnClimate-Data-final.Rdata"))
save(list = c("df.ssp5.all","df.ssp2.all"), file =paste0(wd,"/Rdata/GridSample-SSP2-SSP5-Data-final.Rdata"))












